import numpy as np  # optional
import pandas as pd
from dateutil.parser import parse
from collections import Counter
import re
# 1. How to import pandas and check the version?

# Version string of the installed pandas.
print(pd.__version__)
# show_versions() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
pd.show_versions(as_json=True)

# 2. How to create a series from a list, numpy array and dict?
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = {letter: number for letter, number in zip(mylist, myarr)}

# Solution: the Series constructor accepts each of the three containers.
ser1, ser2, ser3 = (pd.Series(source) for source in (mylist, myarr, mydict))
for built in (ser1, ser2, ser3):
    print(built)
print(ser3.head())

# 3. How to convert the index of a series into a column of a dataframe?
mylist = list('abcedfghijklmnopqrstuvwxyz')
myarr = np.arange(26)
mydict = {key: value for key, value in zip(mylist, myarr)}
ser = pd.Series(mydict)

# Solution: promote the series to a one-column frame, then move the index
# labels out into an ordinary column.
one_column = ser.to_frame()
df = one_column.reset_index()
print(df.head())

# 4. How to combine many series to form a dataframe?
ser1 = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
ser2 = pd.Series(np.arange(26))
# Solution 1: place the series side by side as columns
df = pd.concat((ser1, ser2), axis='columns')
# Solution 2: build the frame from a column-name -> series mapping
# (this rebinds df, so only this second result is printed)
df = pd.DataFrame(dict(col1=ser1, col2=ser2))
print(df.head())

# 5. How to assign a name to a series?
ser = pd.Series(list('abcedfghijklmnopqrstuvwxyz'))
# Solution: name the series itself (the index is left unnamed)
ser = ser.rename('alphabets')
print(ser.head())


# 6. How to get the items of series A not present in series B?
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution: keep the entries of ser1 whose value does not occur in ser2
found_in_ser2 = ser1.isin(ser2)
print(ser1.loc[~found_in_ser2])


# 7. How to get the items not common to both series A and series B?
ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])

# Solution: everything in the union that is not also in the intersection
ser_u = pd.Series(np.union1d(ser1, ser2))
ser_i = pd.Series(np.intersect1d(ser1, ser2))
shared = ser_u.isin(ser_i)
print(ser_u.loc[~shared])


# 8. How to get the minimum, 25th percentile, median, 75th, and max of a numeric series?
state = np.random.RandomState(100)
# 25 draws from a normal distribution with mean 0 and standard deviation 5
ser = pd.Series(state.normal(loc=0, scale=5, size=25))
print(ser.head(10))
print(ser)
print(ser.size)
# Solution: the 0th and 100th percentiles are the minimum and the maximum
quartile_points = [0, 25, 50, 75, 100]
print(np.percentile(ser, q=quartile_points))

# 9. How to get frequency counts of unique items of a series?
letters = list('abcdefgh我')
# Draw the positions once, so the positions printed below are exactly the
# ones the series is built from.
positions = np.random.randint(len(letters), size=30)
ser = pd.Series(np.take(letters, positions))
print(positions)
print(ser)
# Solution
print(ser.value_counts())


# 10. How to keep only top 2 most frequent values as it is and replace everything else as ‘Other’?
# Keep the seeded generator and draw from it; a RandomState that is created
# and thrown away seeds nothing.
state = np.random.RandomState(100)
ser = pd.Series(state.randint(1, 5, [12]))
# Solution
print("Top 2 Freq:", ser.value_counts())
# value_counts() is sorted by descending count, so the first two labels are
# the two most frequent values.
top_two = ser.value_counts().index[:2]
# Switch to object dtype before mixing in the string, instead of assigning a
# string into an integer series in place.
ser = ser.astype(object).where(ser.isin(top_two), 'Other')
print(ser)


# 11. How to bin a numeric series to 10 groups of equal size?
ser = pd.Series(np.random.random(20))
print(ser)
# Solution: cut at every decile and label the bins by rank; only the first
# five binned values are kept in `news`.
decile_edges = [0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1]
decile_labels = ['1st', '2nd', '3rd', '4th', '5th', '6th', '7th', '8th', '9th', '10th']
binned = pd.qcut(ser, q=decile_edges, labels=decile_labels)
news = binned.head()
print(news)
print(news.dtypes)
print(news.cat.categories)



# 12. How to convert a numpy array to a dataframe of given shape? (L1)
ser = pd.Series(np.random.randint(1, 10, 35))
# Solution: lay the 35 values out row by row as 7 rows of 5
grid = ser.values.reshape((7, 5))
df = pd.DataFrame(grid)
print(df)

# 13. How to find the positions of numbers that are multiples of 3 from a series?
ser = pd.Series(np.random.randint(1, 10, 7))
print(ser)
# Solution: np.flatnonzero turns the boolean mask into integer positions.
# The mask is converted to a plain array first so NumPy receives an ndarray
# rather than a Series.
is_multiple = (ser % 3 == 0).to_numpy()
positions = np.flatnonzero(is_multiple)
print(positions)


# 14. How to extract items at given positions from a series
ser = pd.Series(list('abcdefghijklmnopqrstuvwxyz'))
pos = [0, 4, 8, 14, 20]
# Solution: take() selects by integer position; keep the result in a name and
# print it so the script actually shows the answer.
picked = ser.take(pos)
print(picked)


# 15. How to stack two series vertically and horizontally?
ser1 = pd.Series(range(5))
ser2 = pd.Series(list('abcde'))

# Output
# Vertical: Series.append was removed in pandas 2.0. pd.concat along the
# default axis stacks the same way and keeps each series' own index labels.
stacked = pd.concat([ser1, ser2])
print(stacked)
# Horizontal
df = pd.concat([ser1, ser2], axis=1)
print(df)


# 16. How to get the positions of items of series A in another series B?
ser1 = pd.Series([10, 9, 6, 5, 3, 1, 12, 8, 13])
ser2 = pd.Series([1, 3, 10, 13])
# Solution 1: for each value of ser2, the first position in ser1 that equals it
first_hits = []
for wanted in ser2:
    matches = np.where(wanted == ser1)[0]
    first_hits.append(matches.tolist()[0])
print(first_hits)
# Solution 2: look each value of ser2 up in an Index built from ser1
lookup = pd.Index(ser1)
print(list(map(lookup.get_loc, ser2)))


# 17. How to compute the mean squared error on a truth and predicted series?
truth = pd.Series(range(10))
pred = pd.Series(range(10)) + np.random.random(10)

# Solution: mean of the squared element-wise errors, kept in a name and
# printed so the script shows the result.
mse = np.mean((truth - pred) ** 2)
print(mse)

# 18. How to convert the first character of each element in a series to uppercase?
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution 1: str.title (upper-cases the first letter of every word in an element)
titled = ser.map(str.title)
print(titled)
# Solution 2: upper-case only the first character. Slicing with [:1] instead
# of indexing with [0] keeps an empty string from raising IndexError.
capitalized = ser.map(lambda x: x[:1].upper() + x[1:])
print(capitalized)
# Solution 3: list comprehension over the elements
titled_from_list = pd.Series([i.title() for i in ser])
print(titled_from_list)

# 19. How to calculate the number of characters in each word in a series?
ser = pd.Series(['how', 'to', 'kick', 'ass?'])

# Solution: len can be mapped directly; the result is named and printed so
# the script shows it.
word_lengths = ser.map(len)
print(word_lengths)

# 20. How to compute difference of differences between consecutive numbers of a series?
ser = pd.Series([1, 3, 6, 10, 15, 21, 27, 35])
# Solution: difference once, then difference the differences
first_diff = ser.diff()
print(first_diff.tolist())
second_diff = ser.diff().diff()
print(second_diff.tolist())

# 21. How to convert a series of date-strings to a timeseries?
from dateutil.parser import parse

ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution 1: let dateutil parse every string on its own
parsed_dates = ser.map(parse)
print(parsed_dates)

# Solution 2: the strings use several different layouts, so convert them one
# at a time. A single pd.to_datetime(ser) call applies one inferred format to
# the whole series and rejects this mix on pandas >= 2.0.
converted_dates = ser.map(pd.to_datetime)
print(converted_dates)

# 22. How to get the day of month, week number, day of year and day of week from a series of date strings?
ser = pd.Series(['01 Jan 2010', '02-02-2011', '20120303', '2013/04/04', '2014-05-05', '2015-06-06T12:20'])

# Solution
ser_ts = ser.map(parse)

# day of month
print("Date: ", ser_ts.dt.day.tolist())

# week number (ISO 8601); dt.weekofyear was removed in pandas 2.0
week_numbers = ser_ts.dt.isocalendar().week.tolist()
print("Week number: ", week_numbers)

# day of year
print("Day number of year: ", ser_ts.dt.dayofyear.tolist())

# day of week (Monday is 0); dayofweek and weekday are two names for the same value
print("Day of week: ", ser_ts.dt.dayofweek.tolist())
print("Day of week: ", ser_ts.dt.weekday.tolist())

# 23. How to convert year-month string to dates corresponding to the 4th day of the month?
ser = pd.Series(['Jan 2010', 'Feb 2011', 'Mar 2012'])
# Solution 1
# Parse the date
ser_ts = ser.map(parse)
# Construct date string with date as 4
ser_datestr = ser_ts.dt.year.astype('str') + '-' + ser_ts.dt.month.astype('str') + '-' + '04'
# Format it.
fourth_day_strs = [parse(i).strftime('%Y-%m-%d') for i in ser_datestr]
print(fourth_day_strs)
# Solution 2: prepend the day to each string before parsing
fourth_day = ser.map(lambda x: parse('04 ' + x))
print(fourth_day)


# 24. How to filter words that contain at least 2 vowels from a series?
ser = pd.Series(['Apple', 'Orange', 'Plan', 'Python', 'Money'])
# Solution: count the vowel letters of each lower-cased word (repeats count
# every time) and keep the words with two or more.
vowels = set('aeiou')
mask = ser.map(lambda word: sum(letter in vowels for letter in word.lower()) >= 2)
print(ser[mask])


# 25. How to filter valid emails from a series?
emails = pd.Series(['buying books at amazom.com', 'rameses@egypt.com', 'matt@t.co', 'narendra@modi.com'])
# Raw string, so the backslash reaches the regex engine untouched.
pattern = r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}'

# Solution 1 (as series of strings): the whole entry must be an address, so
# use fullmatch; match() would also accept an address followed by other text.
mask = emails.map(lambda x: bool(re.fullmatch(pattern, x)))
valid_emails = emails[mask]
print(valid_emails)

# Solution 2 (as series of list): every address found inside each entry
found_emails = emails.str.findall(pattern, flags=re.IGNORECASE)
print(found_emails)

# Solution 3 (as list): the first address of each entry that contains one
atest = [hits[0] for hits in (re.findall(pattern, email) for email in emails) if hits]
print(atest)

# 26. How to get the mean of a series grouped by another series?
fruit = pd.Series(np.random.choice(['apple', 'banana', 'carrot'], 10))
weights = pd.Series(np.linspace(1, 10, 10))

# Solution: group the weights by the fruit label at the same position; the
# result is named and printed so the script shows it.
mean_weights = weights.groupby(fruit).mean()
print(mean_weights)


# 27. How to compute the euclidean distance between two series?
p = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
q = pd.Series([10, 9, 8, 7, 6, 5, 4, 3, 2, 1])

# Solution: square root of the summed squared differences. Series.sum() is
# used instead of the builtin sum(), which iterates element by element.
squared_gap = (p - q) ** 2
dist_manual = squared_gap.sum() ** .5
print(dist_manual)

# Solution (using func)
dist_norm = np.linalg.norm(p - q)
print(dist_norm)

# 28. How to find all the local maxima (or peaks) in a numeric series?
ser = pd.Series([2, 10, 3, 4, 9, 10, 2, 7, 3])

# Solution: take the sign of each step between neighbours; where the sign
# goes from +1 to -1 its difference is -2, and the peak sits one position
# after that entry.
steps = np.diff(ser)
dd = np.diff(np.sign(steps))
peak_locs = np.flatnonzero(dd == -2) + 1

# 29. How to replace missing spaces in a string with the least frequent character?
my_str = 'dbc deb abed gade'

# Solution
# Build the series from my_str rather than repeating the literal.
ser = pd.Series(list(my_str))
freq = ser.value_counts()
print(freq)
# value_counts() sorts by descending count, so the last label is a least
# frequent character.
least_freq = freq.index[-1]
result = "".join(ser.replace(' ', least_freq))
print(result)


# 30. How to create a TimeSeries starting ‘2000-01-01’ and 10 weekends (saturdays) after that having random numbers as values?
# Ten weekly dates anchored on Saturday, starting from 2000-01-01, indexing
# ten random integers drawn from 1 to 9.
saturdays = pd.date_range(start='2000-01-01', periods=10, freq='W-SAT')
ser = pd.Series(data=np.random.randint(1, 10, 10), index=saturdays)
print(ser)